/*
MIT License

Copyright (c) 2021 МГТУ им. Н.Э. Баумана, кафедра ИУ-6, Михаил Фетисов,

https://bmstu.codes/lsx/simodo
*/

#include "simodo/inout/token/Tokenizer.h"

#include <cassert>


namespace
{
    std::vector<simodo::inout::Tokenizer::_NumberMask> makeInnerMask(std::vector<simodo::inout::NumberMask> mask_set)
    {
        std::vector<simodo::inout::Tokenizer::_NumberMask> result;

        for (const simodo::inout::NumberMask & mask : mask_set) {
            if (mask.chars == simodo::inout::BUILDING_NUMBER)
            {
                int sh = result.size();

    /* 00 */    result.push_back({u"N", mask.type, simodo::inout::TokenQualification::Integer, mask.system, {sh+1,sh+3,sh+4}, true, true});
    /* 01 */    result.push_back({u".", mask.type, simodo::inout::TokenQualification::RealNumber, mask.system, {sh+2,sh+3,sh+4}, false, true});
    /* 02 */    result.push_back({u"N", mask.type, simodo::inout::TokenQualification::RealNumber, mask.system, {sh+3,sh+4}, false, true});
    /* 03 */    result.push_back({u"e", mask.type, simodo::inout::TokenQualification::RealNumber, mask.system, {sh+5,sh+6,sh+7}, false, false});
    /* 04 */    result.push_back({u"E", mask.type, simodo::inout::TokenQualification::RealNumber, mask.system, {sh+5,sh+6,sh+7}, false, false});
    /* 05 */    result.push_back({u"+", mask.type, simodo::inout::TokenQualification::RealNumber, mask.system, {sh+7}, false, false});
    /* 06 */    result.push_back({u"-", mask.type, simodo::inout::TokenQualification::RealNumber, mask.system, {sh+7}, false, false});
    /* 07 */    result.push_back({u"N", mask.type, simodo::inout::TokenQualification::RealNumber, mask.system, {}, false, true});
            }
            else
                result.push_back({mask.chars, mask.type, simodo::inout::TokenQualification::Integer, mask.system, {}, true, true});
        }

        return result;
    }
}

namespace simodo::inout
{

/// \brief Creates a tokenizer that reads from \p input_stream using the lexical rules in \p parameters.
///
/// \param uri_index     Forwarded to the scanner.
/// \param input_stream  Source of characters; forwarded to the scanner.
/// \param parameters    Lexical parameters; its number masks are expanded into `_numbers`.
/// \param context_no    Initial context index; forwarded to the scanner.
Tokenizer::Tokenizer(uri_index_t uri_index, InputStream_interface &input_stream,
                     const LexicalParameters & parameters, context_index_t context_no)
    : _scanner(uri_index,input_stream,context_no),
      _param(parameters)
{
    // Digits and the Latin alphabet are mandatory
    assert(!_param.digits.empty());
    assert(!_param.latin_alphabet.empty());

    // convertToUpper() maps the first half of an alphabet onto its second half,
    // so both alphabets must have an even number of characters
    assert(_param.latin_alphabet.size()%2 == 0);
    assert(_param.national_alphabet.size()%2 == 0);

    /// \note For debugging
    // parameters.masks.push_back({ u"0[1[2]][{3[4]|5[6]}[{{7}[8]}]9]", LexemeType::Number, 10 });
    // parameters.masks.push_back({ u"123{4|5}", LexemeType::Number, 10 });

    // Build the internal number-mask table used by scanNumber()
    _numbers = makeInnerMask(parameters.masks);
}

/// \brief Returns the next token, silently skipping every token of type LexemeType::Comment.
Token Tokenizer::getToken()
{
    for (;;)
    {
        Token candidate = getAnyToken();

        if (candidate.type() != LexemeType::Comment)
            return candidate;
    }
}

/// \brief Scans and returns the next token of any kind, comments included.
///
/// The checks are made in this order: end of stream, an already open markup context, blanks
/// skipping, new-line substitution, markup start, word, number, multi-character punctuation,
/// single-character punctuation. Anything else yields a one-character Error token.
Token Tokenizer::getAnyToken()
{
    // Nothing left in the stream
    if (_scanner.eof())
        return { LexemeType::Empty, u"", _scanner.makeTokenLocation(), TokenQualification::None, _scanner.context() };

    // The scanner is still inside a markup context: continue scanning that markup
    const context_index_t open_context = _scanner.context();

    if (open_context != NO_TOKEN_CONTEXT_INDEX)
    {
        _scanner.fixLocation(open_context);
        return scanMarkup(open_context);
    }

    // Skip blanks, then remember where the token starts
    passBlanks();
    _scanner.fixLocation();

    if (_scanner.eof())
        return { LexemeType::Empty, u"", _scanner.makeTokenLocation() };

    // A new-line character may be reported as a token with a substituted lexeme
    const bool newline_is_token = !_param.nl_substitution.empty();

    if (newline_is_token && _scanner.getChar() == '\n')
    {
        _scanner.shift(1);
        return { LexemeType::NewLine, _param.nl_substitution, _scanner.makeTokenLocation() };
    }

    // Marked-up text: the first markup whose start sequence matches wins
    for (size_t markup_no = 0; markup_no != _param.markups.size(); ++markup_no)
    {
        const MarkupSymbol & markup = _param.markups[markup_no];

        if (_scanner.startsWith(markup.start))
            return scanMarkup(static_cast<uint32_t>(markup_no));
    }

    // Word (variable, identifier, etc.), classified by its first character
    if (_scanner.startsWithAnyOf(_param.id_extra_symbols))
        return scanWord(NationalCharAffiliation::Extra);

    if (_scanner.startsWithAnyOf(_param.latin_alphabet))
        return scanWord(NationalCharAffiliation::Latin);

    if (_scanner.startsWithAnyOf(_param.national_alphabet))
        return scanWord(NationalCharAffiliation::National);

    // Number?
    {
        LexemeType          number_type;
        TokenQualification  number_qualification;
        std::u16string      number_text;

        if (scanNumber(number_type, number_qualification, number_text))
            return { number_type, number_text, _scanner.makeTokenLocation(), number_qualification };
    }

    // Multi-character punctuation (not a keyword)
    for (const std::u16string & word : _param.punctuation_words)
    {
        if (!_scanner.startsWith(word))
            continue;

        if (word == _param.eof_symbol)
            _scanner.setEOF();

        _scanner.shift(word.size());

        return { LexemeType::Punctuation, word, _scanner.makeTokenLocation() };
    }

    // Single-character punctuation
    if (_scanner.startsWithAnyOf(_param.punctuation_chars))
    {
        std::u16string symbol(1, _scanner.getFirstChar());

        if (symbol == _param.eof_symbol)
            _scanner.setEOF();

        _scanner.shift(1);

        return { LexemeType::Punctuation, symbol, _scanner.makeTokenLocation() };
    }

    // Something unknown, i.e. an error: consume exactly one character
    std::u16string unknown(1, _scanner.getFirstChar());

    _scanner.shift(1);

    return { LexemeType::Error, unknown, _scanner.makeTokenLocation(), TokenQualification::UnknownCharacterSet };
}

/// \brief Advances the scanner past blank characters.
///
/// Stops at end of stream, at the first non-blank character, or - when a new-line substitution
/// is configured - at a '\n', so that the new line can be reported as a token.
void Tokenizer::passBlanks()
{
    const bool keep_newlines = !_param.nl_substitution.empty();

    for (; !_scanner.eof(); _scanner.shift(1))
    {
        const auto current = _scanner.getChar();

        if (keep_newlines && current == '\n')
            return;

        if (!isBlank(current))
            return;
    }
}

/// \brief Scans marked-up text described by `_param.markups[context]`.
///
/// Two strings are collected: the token text (everything consumed, including the start, end and
/// ignore sequences) and the lexeme (the content only). If the scanner is not already in this
/// context, the markup's start sequence is consumed first. A markup with an empty end sequence
/// stops before '\n' and always closes its context; otherwise the context is closed only when
/// the end sequence is found, and stays open if the stream ends first.
///
/// \param context  Index of the markup in `_param.markups`.
/// \return Token of the markup's type carrying the resulting context.
Token Tokenizer::scanMarkup(context_index_t context)
{
    assert(context < _param.markups.size());

    const MarkupSymbol & markup = _param.markups[context];

    std::u16string content;     // lexeme: text without the markup's service sequences
    std::u16string raw;         // token: text exactly as consumed

    // Copies the current character into both strings and steps over it
    auto takeChar = [&]()
    {
        const auto ch = _scanner.getFirstChar();

        raw += ch;
        content += ch;
        _scanner.shift(1);
    };

    // Entering the markup now (not continuing an open one): consume its start sequence
    if (_scanner.context() != context)
    {
        _scanner.shift(markup.start.size());
        raw += markup.start;
    }

    while (!_scanner.eof())
    {
        // After the ignore sequence the next character is taken literally
        if (_scanner.startsWith(markup.ignore_sign))
        {
            raw += markup.ignore_sign;
            _scanner.shift(markup.ignore_sign.size());

            if (_scanner.eof())
                break;

            takeChar();
            continue;
        }

        if (markup.end.empty())
        {
            // No end sequence: the markup runs up to (not including) the end of line
            if (_scanner.getFirstChar() == u'\n')
                break;
        }
        else if (_scanner.startsWith(markup.end))
        {
            raw += markup.end;
            _scanner.shift(markup.end.size());
            context = NO_TOKEN_CONTEXT_INDEX;
            break;
        }

        takeChar();
    }

    if (markup.end.empty())
        context = NO_TOKEN_CONTEXT_INDEX;

    // The location must be taken before the scanner's context is updated
    TokenLocation loc = _scanner.makeTokenLocation();

    _scanner.fixLocation(context);

    return { {content, markup.type}, raw, loc, TokenQualification::None, context };
}

/// \brief Scans a word (identifier or keyword) starting at the current scanner position.
///
/// The first character is consumed unconditionally; the word then continues while the next
/// character belongs to the extra identifier symbols, the Latin alphabet, the national alphabet
/// or the digits. The collected word is classified as:
///  - Punctuation/Keyword if it equals one of `_param.punctuation_words` (compared in upper case
///    when the language is not case sensitive) or is a single punctuation character;
///  - Error/NationalCharacterUse if it has national letters and they are not allowed;
///  - Id or Error with NationalCharacterMix if it mixes national and Latin letters;
///  - plain Id otherwise.
///
/// \param first_char  Character set the first character of the word belongs to.
/// \return The scanned token.
Token Tokenizer::scanWord(Tokenizer::NationalCharAffiliation first_char)
{
    bool    has_latin    = (first_char == NationalCharAffiliation::Latin);
    bool    has_national = (first_char == NationalCharAffiliation::National);

    std::u16string lexeme_str;

    lexeme_str += _scanner.getFirstChar();
    _scanner.shift(1);

    // Collect the lexeme
    while(!_scanner.eof())
    {
        if (_scanner.startsWithAnyOf(_param.id_extra_symbols))
            ;
        else if (_scanner.startsWithAnyOf(_param.latin_alphabet))
            has_latin = true;
        else if (_scanner.startsWithAnyOf(_param.national_alphabet))
            has_national = true;
        else if (!_scanner.startsWithAnyOf(_param.digits))
            break;

        lexeme_str += _scanner.getFirstChar();
        _scanner.shift(1);
    }

    // Multi-character punctuation (keywords).
    // The comparison key does not depend on the word being tested, so it is built once
    // instead of upper-casing the lexeme again for every punctuation word.
    const std::u16string keyword_key = _param.is_case_sensitive ? lexeme_str
                                                                : convertToUpper(lexeme_str);

    for(const std::u16string & s : _param.punctuation_words)
    {
        if (s != keyword_key)
            continue;

        if (s == _param.eof_symbol)
            _scanner.setEOF();

        // NOTE(review): in the case-insensitive branch the token carries the spelling as typed
        // rather than the canonical word `s` - confirm this asymmetry is intended.
        if (_param.is_case_sensitive)
            return { {lexeme_str, LexemeType::Punctuation}, s, _scanner.makeTokenLocation(), TokenQualification::Keyword };

        return { LexemeType::Punctuation, lexeme_str, _scanner.makeTokenLocation(), TokenQualification::Keyword };
    }

    // Single-character punctuation
    if (lexeme_str.size() == 1)
        if (_param.punctuation_chars.find(lexeme_str.front()) != std::u16string::npos)
            return { LexemeType::Punctuation, lexeme_str, _scanner.makeTokenLocation(), TokenQualification::Keyword};

    if (has_national)
    {
        if (!_param.may_national_letters_use)
            return { LexemeType::Error, lexeme_str, _scanner.makeTokenLocation(), TokenQualification::NationalCharacterUse };

        // Mixed alphabets: an identifier or an error depending on the settings,
        // flagged with NationalCharacterMix either way
        if (has_latin)
            return { _param.may_national_letters_mix ? LexemeType::Id : LexemeType::Error,
                     lexeme_str, _scanner.makeTokenLocation(), TokenQualification::NationalCharacterMix };
    }

    return { LexemeType::Id, lexeme_str, _scanner.makeTokenLocation() };
}

/// \brief Tries to recognise a number at the current scanner position using the mask table `_numbers`.
///
/// Entries of `_numbers` flagged `is_starting` are tried in table order; the iteration stops at the
/// first entry that is not a starting one. The input is only peeked with `_scanner.getChar(offset)`;
/// the scanner is advanced solely on the paths that return true.
///
/// Mask characters: `N` stands for a run of digits, `n` for exactly one digit, any other character
/// must match the input literally. A character counts as a digit when its index in `_param.digits`
/// is less than the mask's `system`; for systems above 10 it is first passed through
/// convertLatinToUpper().
///
/// \param[out] type           Token type: the mask's type on success, LexemeType::Error on a partial match.
/// \param[out] qualification  The mask's qualification on success, TokenQualification::NotANumber on a partial match.
/// \param[out] lexeme_str     The text that was consumed.
/// \return true if a token (a number or a NotANumber error) was consumed, false otherwise.
bool Tokenizer::scanNumber(LexemeType &type, TokenQualification &qualification, std::u16string &lexeme_str)
{
    for(size_t i_starting=0; i_starting < _numbers.size(); ++i_starting)
    {
        if (_numbers[i_starting].is_starting)
        {
            // Every attempt starts from scratch
            lexeme_str.clear();

            size_t  i_mask_index = i_starting;  // entry of _numbers currently being matched
            size_t  i_mask_char  = 0;           // position inside that entry's `chars`
            size_t  i_input      = 0;           // look-ahead offset into the input
            int16_t N_count      = -1;          // incremented on every pass through the N/n branch, reset to -1 by a literal match

            while(true)
            {
                const _NumberMask & mask    = _numbers[i_mask_index];
                char16_t            ch      = _scanner.getChar(i_input);

                // The current mask entry has been matched completely
                if (i_mask_char == mask.chars.size())
                {
                    // Nothing was collected: give up entirely, no further starting masks are tried
                    if (lexeme_str.empty())
                        return false;

                    // The entry ended with a single-digit placeholder but another digit follows:
                    // this attempt fails
                    if (mask.chars[i_mask_char-1] == u'n')
                    {
                        char16_t ch_upper;
                        if (mask.system > 10 )
                            ch_upper = convertLatinToUpper(ch);
                        else
                            ch_upper = ch;

                        if (_param.digits.find(ch_upper) < mask.system)
                            break;
                    }

                    if (!mask.refs.empty())
                    {
                        // Look for the first referenced entry that can continue with the next input
                        // character: either its first mask character equals it, or that mask
                        // character is a digit placeholder and the input character is a digit
                        size_t i_ref=0;
                        for(; i_ref < mask.refs.size(); ++i_ref)
                        {
                            uint8_t ref_no = mask.refs[i_ref];

                            assert(static_cast<size_t>(ref_no) < _numbers.size());

                            const _NumberMask & ref_mask = _numbers[static_cast<size_t>(ref_no)];

                            assert(!ref_mask.chars.empty());

                            char16_t ch_upper;

                            if (mask.system > 10 )
                                ch_upper = convertLatinToUpper(ch);
                            else
                                ch_upper = ch;

                            if (ch == ref_mask.chars[0]
                             || ((ref_mask.chars[0] == u'N' || ref_mask.chars[0] == u'n') && _param.digits.find(ch_upper) < mask.system))
                                break;
                        }

                        // A continuation was found: switch to it and keep matching from the same input offset
                        if (i_ref < mask.refs.size())
                        {
                            i_mask_index = static_cast<size_t>(mask.refs[i_ref]);
                            i_mask_char = 0;
                            continue;
                        }

                        // No continuation fits and the number may not end on this entry: the attempt fails
                        if (!mask.may_final)
                            break;
                    }

                    // Success: report the entry's type and qualification and consume the lexeme
                    type = mask.type;
                    qualification = mask.qualification;
                    _scanner.shift(lexeme_str.size());
                    return true;
                }

                // Digit placeholder: 'N' keeps accepting digits, 'n' accepts exactly one
                if (mask.chars[i_mask_char] == u'N' || mask.chars[i_mask_char] == u'n')
                {
                    N_count ++;

                    char16_t ch_upper;
                    if (mask.system > 10 )
                        ch_upper = convertLatinToUpper(ch);
                    else
                        ch_upper = ch;

                    if (_param.digits.find(ch_upper) < mask.system)
                    {
                        lexeme_str += ch;
                        // 'n' is satisfied by this one digit; 'N' stays on the same mask position
                        if (mask.chars[i_mask_char] == u'n')
                            i_mask_char ++;
                    }
                    else if (N_count == 0)
                        break;      // not a digit on the first pass since the counter was reset
                    else if (mask.chars[i_mask_char] == u'n')
                        break;      // 'n' requires a digit here
                    else
                    {
                        // End of the 'N' run: go to the next mask character without consuming input
                        i_mask_char ++;
                        continue;
                    }
                }
                // Literal mask character
                else if (mask.chars[i_mask_char] == ch)
                {
                    lexeme_str += ch;
                    i_mask_char ++;
                    N_count = -1;
                }
                else
                    break;

                i_input ++ ;
            }
        }
        // don't remove!
        // (the first non-starting entry ends the search over the table)
        else
            break;
    }

    // No mask matched completely. If the last attempt collected some characters,
    // consume them and report an erroneous number.
    if (!lexeme_str.empty())
    {
        _scanner.shift(lexeme_str.size());
        type = LexemeType::Error;
        qualification = TokenQualification::NotANumber;
        return true;
    }

    return false;
}

std::u16string Tokenizer::convertToUpper(std::u16string s) const
{
    std::u16string res;

    for(char16_t c : s)
        res += convertToUpper(c);

    return res;
}

/// \brief Maps a character to its counterpart in the second half of its alphabet.
///
/// A character found in the first half of `_param.latin_alphabet` or `_param.national_alphabet`
/// is replaced by the character at the same offset in the second half of that alphabet.
/// The Latin alphabet is consulted first; any other character is returned unchanged.
char16_t Tokenizer::convertToUpper(char16_t ch) const
{
    const size_t latin_half = _param.latin_alphabet.size() / 2;
    const size_t latin_pos  = _param.latin_alphabet.find(ch);

    // A Latin character never falls through to the national alphabet
    if (latin_pos != std::u16string::npos)
        return (latin_pos < latin_half) ? _param.latin_alphabet.at(latin_half + latin_pos) : ch;

    const size_t national_half = _param.national_alphabet.size() / 2;
    const size_t national_pos  = _param.national_alphabet.find(ch);

    if (national_pos != std::u16string::npos && national_pos < national_half)
        return _param.national_alphabet.at(national_half + national_pos);

    return ch;
}

/// \brief Maps a character from the first half of `_param.latin_alphabet` to the character
///        at the same offset in its second half; any other character is returned unchanged.
char16_t Tokenizer::convertLatinToUpper(char16_t ch) const
{
    const size_t half     = _param.latin_alphabet.size() / 2;
    const size_t position = _param.latin_alphabet.find(ch);

    const bool in_first_half = (position != std::u16string::npos) && (position < half);

    return in_first_half ? _param.latin_alphabet.at(half + position) : ch;
}

/// \brief Tells whether \p ch is a blank: space, tab, carriage return or line feed.
///
/// Compares the character directly instead of building a temporary std::u16string on every
/// call; passBlanks() calls this once per scanned character.
bool Tokenizer::isBlank(char16_t ch) const
{
    return ch == u' ' || ch == u'\t' || ch == u'\r' || ch == u'\n';
}

}